suppressPackageStartupMessages(library(tidyverse))
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities
Settings
# Input/output locations. `wd` ends with a slash because every derived path
# is built by plain string concatenation.
data_dir <- '/Volumes/Mitsu_NGS_3/METTL2A/'
wd <- "~/Google Drive/My Drive/Analysis/METTL2A/"
setwd(wd)
figdir <- str_c(wd, 'Figures/DRS_m3C_sites/Metagene_RNA/')
tabledir <- str_c(wd, 'Tables/DRS_m3C_sites/Metagene_RNA/')

# Session-wide ggplot2 default: classic theme at 7 pt with the legend below.
theme_set(theme_classic(base_size = 7) + theme(legend.position = 'bottom'))
Functions
#' Build a path inside the analysis directory.
#'
#' @param path Character vector of paths relative to `base_dir`.
#' @param base_dir Directory prefix. It is concatenated as-is, so it must
#'   already end with a path separator. Defaults to the global `wd`, which is
#'   looked up when the function is called.
#' @return Character vector: `base_dir` followed by each element of `path`.
paste_wd <- function(path, base_dir = wd) {
  paste0(base_dir, path)
}
#' Expand each transcript sequence into one row per base.
#'
#' @param df Data frame with `transcript_id` and `transcript_seq` columns.
#' @return Ungrouped data frame in which `transcript_seq` is replaced by
#'   `base` (one character per row), plus `position`, the 1-based row index
#'   of the base within its `transcript_id` group (stored as double).
calc_base_position <- function(df) {
  per_base <- df |>
    mutate(transcript_seq = str_split(transcript_seq, '')) |>
    unnest(transcript_seq) |>
    dplyr::rename(base = transcript_seq)

  per_base |>
    group_by(transcript_id) |>
    # row_number() restarts at 1 in every group; as.numeric() keeps the
    # column a double, as downstream tables expect.
    mutate(position = as.numeric(row_number())) |>
    ungroup()
}
#' Attach transcript annotation and compute the relative position in the RNA.
#'
#' @param df Data frame with a `position` column and the key column(s) it
#'   shares with `annotation`.
#' @param annotation Per-transcript annotation table that provides `length`.
#'   Defaults to the global `DRS_methylated_RNAs_annotation`, which is looked
#'   up when the function is called.
#' @return `df` with `position` renamed to `kmer_middle`, the annotation
#'   columns joined on, and `rel_kmer_middle = kmer_middle / length`.
calc_relposition_in_RNA <- function(df,
                                    annotation = DRS_methylated_RNAs_annotation) {
  df |>
    dplyr::rename(kmer_middle = position) |>
    # Natural join kept as in the original: the keys are whatever columns the
    # two tables share (`transcript_id` for the tables built in this script).
    left_join(annotation) |>
    mutate(rel_kmer_middle = kmer_middle / length)
}
#' Locate CC dinucleotides in each transcript sequence.
#'
#' @param df Data frame with `transcript_id` and `transcript_seq` columns.
#' @param overlapping If `FALSE` (default, the original behaviour), matches
#'   are non-overlapping, so "CCC" yields a single hit at 1.5. If `TRUE`,
#'   every C followed by a C is reported, so "CCC" yields 1.5 and 2.5.
#' @return Data frame with `transcript_id` and `position`, where `position`
#'   is the midpoint between the two cytosines (start + 0.5).
calc_CC_position <- function(df, overlapping = FALSE) {
  # The look-ahead matches only the first C, which lets the next match start
  # on the second C of the pair.
  cc_pattern <- if (overlapping) 'C(?=C)' else 'CC'

  df |>
    mutate(position = str_locate_all(transcript_seq, cc_pattern)) |>
    unnest(position) |>
    # Column 1 is the match start; for a plain 'CC' match this equals
    # (start + end) / 2, so default results are unchanged.
    mutate(position = position[, 1] + 0.5) |>
    select(transcript_id, position)
}
#' Draw and save the metagene density plot for one bandwidth adjustment.
#'
#' Reads the global `rel_position_allC_m3C`, drops rows whose `genetype2` is
#' missing, and plots the density of `rel_kmer_middle` coloured by `type`,
#' with one free-scaled facet per `genetype2`. The figure is written to
#' `figdir` through `ggsave_multiple_formats()`.
#'
#' @param adjust_value Passed to `geom_density(adjust = )` and appended to
#'   the output file basename.
#' @return The value returned by `ggsave_multiple_formats()`.
plot_metagene_distribution_different_adjustment <- function(adjust_value) {
  annotated_sites <- filter(rel_position_allC_m3C, !is.na(genetype2))

  metagene_plot <-
    ggplot(annotated_sites, aes(x = rel_kmer_middle, colour = type)) +
    geom_density(adjust = adjust_value) +
    facet_wrap( ~ genetype2, ncol = 1, scales = 'free') +
    # Unnamed colours are assigned to the `type` levels in their default order.
    scale_color_manual(values = c('gray', 'blue', 'red'))

  ggsave_multiple_formats(
    metagene_plot,
    basename = paste0('metageneplot_RNAs_groupedby_type_adjust_', adjust_value),
    outdir = figdir, width = 4, height = 12, fontsize = 7
  )
}
Read data
Methylated positions
# Load the methylated-position table; the path is resolved against `wd`.
DRS_methylated_positions <- read_tsv(
  paste_wd(
    'Tables/DRS_m3C_sites/DRS_methylated_positions_relative_range_2024-04-22.tsv'
  )
)
## Rows: 489 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): transcript_id, gene_name, seqname, gene_type, ref_kmer, genetype2
## dbl (7): kmer_start, kmer_end, kmer_middle, length, rel_kmer_start, rel_kmer...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Write a copy of the input table into this analysis' table folder.
export_tsv(DRS_methylated_positions, outdir = tabledir)
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/Metagene_RNA/DRS_methylated_positions_2024-07-29.tsv
## # A tibble: 489 × 13
## transcript_id gene_name seqname gene_type ref_kmer kmer_start kmer_end
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 ENST00000429711.7 RPL32 chr3 protein_cod… GCCCA 423 427
## 2 ENST00000647248.2 RPL35A chr3 protein_cod… ACCCC 381 385
## 3 ENST00000647248.2 RPL35A chr3 protein_cod… CCCCT 382 386
## 4 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA CCCCG 58 62
## 5 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA ACCCT 76 80
## 6 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA ATCAA 94 98
## 7 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA GCCAC 149 153
## 8 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA ACCCC 154 158
## 9 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA CCCCC 155 159
## 10 ENST00000389680.2 MT-RNR1 chrM Mt_rRNA CCCCA 156 160
## # ℹ 479 more rows
## # ℹ 6 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## # rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>
Transcript sequences
# Load transcript sequences and keep only the id and sequence columns.
espresso_transcript_seqs <-
  paste_wd('Tables/Database/espresso_AsPC1_transcriptome_seqs_2024-04-22.tsv.gz') |>
  read_tsv() |>
  select(-transcript_length)
## Rows: 36717 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, transcript_seq
## dbl (1): transcript_length
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the sequence table for a quick visual check.
espresso_transcript_seqs
## # A tibble: 36,717 × 2
## transcript_id transcript_seq
## <chr> <chr>
## 1 ENST00000339437.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCAACAGCGGGGCCGATGTGT…
## 2 ENST00000251607.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCAACAGCGGGGCCGATGTGT…
## 3 ENST00000420393.5 CAGCGGGGCCGGTAAGCGGGCGCGCGCCGCTCAGAGGGGCAGAGTTGGTGGCGTGAG…
## 4 ENST00000698415.1 GATGTATGATGAGTTTAGTTGAATGCTCGTGTTGCTGTCTGCTAGCCAAAGACCAAC…
## 5 ENST00000698416.1 CATGACTAGTTTTGTGGGTAGCAATGATGTTTAAATGTCACACACTAACCTTTTTAA…
## 6 ENST00000488263.5 AGGAACTTCATCATGAAGTCTCAAGTAAACGAACATTTTATCTTTCTTGGGATTCTA…
## 7 ENST00000424814.5 GAGATCAGCAGGACGCTGCGCACAACATGGGCAACCACCTGCCGCTCCTGCCTGCAG…
## 8 ENST00000231948.9 AGACATGGCCGGCGAAGGAGATCAGCAGGACGCTGCGCACAACATGGGCAACCACCT…
## 9 ENST00000432408.6 GCCTCCTTTGCGGGTAAACAGACATGGCCGGCGAAGGAGATCAGCAGGACGCTGCGC…
## 10 ENST00000459840.5 ATGGAGGCATTTAAACTGGGACTGAGATGGGACTGAGTGATTAAATTGCTACCAGTG…
## # ℹ 36,707 more rows
List of methylated RNAs
# One row per transcript that appears in the methylated-position table.
DRS_methylated_RNAs <- distinct(DRS_methylated_positions, transcript_id)
DRS_methylated_RNAs
## # A tibble: 71 × 1
## transcript_id
## <chr>
## 1 ENST00000429711.7
## 2 ENST00000647248.2
## 3 ENST00000389680.2
## 4 ENST00000361390.2
## 5 ENST00000361453.3
## 6 ENST00000387347.2
## 7 ENST00000361624.2
## 8 ENST00000361739.1
## 9 ENST00000361899.2
## 10 ENST00000361227.2
## # ℹ 61 more rows
Annotation of the methylated RNAs
# Per-transcript annotation: every gene*/transcript* column plus `length`,
# reduced to the unique combinations.
DRS_methylated_RNAs_annotation <- distinct(
  select(
    DRS_methylated_positions,
    starts_with('gene'), starts_with('transcript'), length
  )
)
DRS_methylated_RNAs_annotation
## # A tibble: 71 × 5
## gene_name gene_type genetype2 transcript_id length
## <chr> <chr> <chr> <chr> <dbl>
## 1 RPL32 protein_coding mRNA ENST00000429711.7 2094
## 2 RPL35A protein_coding mRNA ENST00000647248.2 1234
## 3 MT-RNR1 Mt_rRNA Mt_rRNA ENST00000389680.2 954
## 4 MT-ND1 protein_coding mt-mRNA ENST00000361390.2 956
## 5 MT-ND2 protein_coding mt-mRNA ENST00000361453.3 1042
## 6 MT-RNR2 Mt_rRNA Mt_rRNA ENST00000387347.2 1559
## 7 MT-CO1 protein_coding mt-mRNA ENST00000361624.2 1542
## 8 MT-CO2 protein_coding mt-mRNA ENST00000361739.1 684
## 9 MT-ATP6 protein_coding mt-mRNA ENST00000361899.2 681
## 10 MT-ND3 protein_coding mt-mRNA ENST00000361227.2 346
## # ℹ 61 more rows
Prepare dataframe of base positions in the methylated RNAs
# Keep the sequences of the methylated transcripts only, then expand them to
# one row per base. The join key is spelled out so the result does not depend
# on which column names the two tables happen to share.
methylated_RNAs_base_positions <-
  espresso_transcript_seqs |>
  right_join(DRS_methylated_RNAs, by = join_by(transcript_id)) |>
  calc_base_position()
## Joining with `by = join_by(transcript_id)`
# Print the per-base position table for a quick visual check.
methylated_RNAs_base_positions
## # A tibble: 101,437 × 3
## transcript_id base position
## <chr> <chr> <dbl>
## 1 ENST00000429711.7 A 1
## 2 ENST00000429711.7 G 2
## 3 ENST00000429711.7 C 3
## 4 ENST00000429711.7 C 4
## 5 ENST00000429711.7 C 5
## 6 ENST00000429711.7 T 6
## 7 ENST00000429711.7 T 7
## 8 ENST00000429711.7 G 8
## 9 ENST00000429711.7 C 9
## 10 ENST00000429711.7 G 10
## # ℹ 101,427 more rows
CC positions
# Locate CC dinucleotides in the methylated transcripts and express them
# relative to transcript length. The join key is spelled out so the result
# does not depend on which column names the two tables happen to share.
methylated_RNAs_CC_positions <-
  espresso_transcript_seqs |>
  right_join(DRS_methylated_RNAs, by = join_by(transcript_id)) |>
  calc_CC_position() |>
  calc_relposition_in_RNA() |>
  mutate(type = 'all CC')
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
# Print the CC position table for a quick visual check.
methylated_RNAs_CC_positions
## # A tibble: 5,498 × 8
## transcript_id kmer_middle gene_name gene_type genetype2 length
## <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 ENST00000429711.7 3.5 RPL32 protein_coding mRNA 2094
## 2 ENST00000429711.7 13.5 RPL32 protein_coding mRNA 2094
## 3 ENST00000429711.7 16.5 RPL32 protein_coding mRNA 2094
## 4 ENST00000429711.7 20.5 RPL32 protein_coding mRNA 2094
## 5 ENST00000429711.7 32.5 RPL32 protein_coding mRNA 2094
## 6 ENST00000429711.7 43.5 RPL32 protein_coding mRNA 2094
## 7 ENST00000429711.7 59.5 RPL32 protein_coding mRNA 2094
## 8 ENST00000429711.7 65.5 RPL32 protein_coding mRNA 2094
## 9 ENST00000429711.7 82.5 RPL32 protein_coding mRNA 2094
## 10 ENST00000429711.7 85.5 RPL32 protein_coding mRNA 2094
## # ℹ 5,488 more rows
## # ℹ 2 more variables: rel_kmer_middle <dbl>, type <chr>
Combine all-C, all-CC and m3C positions
# `methylated_RNAs_C_positions` was used here without ever being defined in
# this script; build it from the per-base table by keeping cytosines only.
methylated_RNAs_C_positions <-
  methylated_RNAs_base_positions |>
  filter(base == 'C')

# Stack the three site sets (every C, every CC, called m3C sites) into one
# long table, labelled by `type`.
rel_position_allC_m3C <-
  bind_rows(
    methylated_RNAs_C_positions |>
      calc_relposition_in_RNA() |>
      mutate(type = 'allC'),
    methylated_RNAs_CC_positions,
    DRS_methylated_positions |> mutate(type = 'm3C')
  )
## Joining with `by = join_by(transcript_id)`
# Print the combined table for a quick visual check.
rel_position_allC_m3C
## # A tibble: 30,104 × 15
## transcript_id base kmer_middle gene_name gene_type genetype2 length
## <chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 ENST00000429711.7 C 3 RPL32 protein_coding mRNA 2094
## 2 ENST00000429711.7 C 4 RPL32 protein_coding mRNA 2094
## 3 ENST00000429711.7 C 5 RPL32 protein_coding mRNA 2094
## 4 ENST00000429711.7 C 9 RPL32 protein_coding mRNA 2094
## 5 ENST00000429711.7 C 11 RPL32 protein_coding mRNA 2094
## 6 ENST00000429711.7 C 13 RPL32 protein_coding mRNA 2094
## 7 ENST00000429711.7 C 14 RPL32 protein_coding mRNA 2094
## 8 ENST00000429711.7 C 16 RPL32 protein_coding mRNA 2094
## 9 ENST00000429711.7 C 17 RPL32 protein_coding mRNA 2094
## 10 ENST00000429711.7 C 20 RPL32 protein_coding mRNA 2094
## # ℹ 30,094 more rows
## # ℹ 8 more variables: rel_kmer_middle <dbl>, type <chr>, seqname <chr>,
## # ref_kmer <chr>, kmer_start <dbl>, kmer_end <dbl>, rel_kmer_start <dbl>,
## # rel_kmer_end <dbl>
# Write the combined table, gzip-compressed, into this analysis' table folder.
export_tsv(rel_position_allC_m3C, outdir = tabledir, compression = 'gz')
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/Metagene_RNA/rel_position_allC_m3C_2024-07-29.tsv.gz
## # A tibble: 30,104 × 15
## transcript_id base kmer_middle gene_name gene_type genetype2 length
## <chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 ENST00000429711.7 C 3 RPL32 protein_coding mRNA 2094
## 2 ENST00000429711.7 C 4 RPL32 protein_coding mRNA 2094
## 3 ENST00000429711.7 C 5 RPL32 protein_coding mRNA 2094
## 4 ENST00000429711.7 C 9 RPL32 protein_coding mRNA 2094
## 5 ENST00000429711.7 C 11 RPL32 protein_coding mRNA 2094
## 6 ENST00000429711.7 C 13 RPL32 protein_coding mRNA 2094
## 7 ENST00000429711.7 C 14 RPL32 protein_coding mRNA 2094
## 8 ENST00000429711.7 C 16 RPL32 protein_coding mRNA 2094
## 9 ENST00000429711.7 C 17 RPL32 protein_coding mRNA 2094
## 10 ENST00000429711.7 C 20 RPL32 protein_coding mRNA 2094
## # ℹ 30,094 more rows
## # ℹ 8 more variables: rel_kmer_middle <dbl>, type <chr>, seqname <chr>,
## # ref_kmer <chr>, kmer_start <dbl>, kmer_end <dbl>, rel_kmer_start <dbl>,
## # rel_kmer_end <dbl>
Number of sites per type and gene type
# Tally rows for every (type, genetype2) combination.
count(rel_position_allC_m3C, type, genetype2)
## # A tibble: 12 × 3
## type genetype2 n
## <chr> <chr> <int>
## 1 all CC Mt_rRNA 156
## 2 all CC mRNA 4608
## 3 all CC mt-mRNA 713
## 4 all CC <NA> 21
## 5 allC Mt_rRNA 650
## 6 allC mRNA 20548
## 7 allC mt-mRNA 2815
## 8 allC <NA> 104
## 9 m3C Mt_rRNA 47
## 10 m3C mRNA 257
## 11 m3C mt-mRNA 182
## 12 m3C <NA> 3
Plot
# Render and save the metagene plot once per density bandwidth multiplier.
# map() is kept (rather than walk()) so the returned list is still printed.
map(
  c(1/10, 1/5, 1/2, 1, 2, 5, 10),
  plot_metagene_distribution_different_adjustment
)
## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

##
## [[5]]

##
## [[6]]

##
## [[7]]
